// Anime4K_Upscale_CNN_x2_L
// 移植自 https://github.com/bloc97/Anime4K/blob/master/glsl/Upscale/Anime4K_Upscale_CNN_x2_L.glsl

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_1


//!TEXTURE
Texture2D INPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;

//!SAMPLER
//!FILTER POINT
SamplerState sam;

//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;


//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
//!OUT tex1, tex2
//!BLOCK_SIZE 16
//!NUM_THREADS 64

void Pass1(uint2 blockStart, uint3 threadId) {
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}
	float2 inputPt = GetInputPt();

	uint i, j;

	float3 src[4][4];
	[unroll]
	for (i = 0; i <= 2; i += 2) {
		[unroll]
		for (j = 0; j <= 2; j += 2) {
			float2 tpos = (gxy + uint2(i, j)) * inputPt;
			const float4 sr = INPUT.GatherRed(sam, tpos);
			const float4 sg = INPUT.GatherGreen(sam, tpos);
			const float4 sb = INPUT.GatherBlue(sam, tpos);

			// w z
			// x y
			src[i][j] = float3(sr.w, sg.w, sb.w);
			src[i][j + 1] = float3(sr.x, sg.x, sb.x);
			src[i + 1][j] = float3(sr.z, sg.z, sb.z);
			src[i + 1][j + 1] = float3(sr.y, sg.y, sb.y);
		}
	}

	[unroll]
	for (i = 1; i <= 2; ++i) {
		[unroll]
		for (j = 1; j <= 2; ++j) {
			uint2 destPos = gxy + uint2(i - 1, j - 1);

			if (i != 1 || j != 1) {
				if (destPos.x >= inputSize.x || destPos.y >= inputSize.y) {
					continue;
				}
			}

			float4 target1 = mul(src[i - 1][j - 1], float3x4(-0.01802505, -0.06508559, 0.0017542128, -0.25521114, -0.024155967, 0.07601142, 0.07508073, -0.69212615, 0.06438325, 0.07916419, -0.07266247, -0.17089996));
			target1 += mul(src[i - 1][j], float3x4(-0.18881685, 0.063188724, 0.08637344, -0.20066689, -0.22774473, -0.10913083, -0.048009537, -0.27475306, -0.15950447, -0.027433012, 0.030303264, 0.018863251));
			target1 += mul(src[i - 1][j + 1], float3x4(0.19171959, 0.028070524, -0.09780952, 0.057611514, 0.26147488, 0.07180017, 0.09667393, 0.008605127, 0.011190245, 0.040944707, -0.025871381, -0.011468774));
			target1 += mul(src[i][j - 1], float3x4(-0.08601777, 0.2180965, -0.052060187, -0.08091977, 0.3995336, 0.21213862, 0.105202965, 0.010929526, -0.080743685, -0.040439352, -0.07998862, 0.08096829));
			target1 += mul(src[i][j], float3x4(-0.08608087, -0.10902571, 0.5245411, 0.53331816, -0.5507893, 0.6141555, 0.9744618, 0.89056116, 0.081408836, 0.28096125, 0.13647869, 0.026274862));
			target1 += mul(src[i][j + 1], float3x4(0.13985278, -0.29691598, -0.26792645, -0.042485088, 0.27941233, -0.65306807, -0.360506, 0.03810829, 0.09663724, -0.26655033, 0.04372338, 0.063264176));
			target1 += mul(src[i + 1][j - 1], float3x4(0.18246013, -0.070464276, 0.04686214, 0.059332885, 0.16861221, 0.022623831, -0.09161491, 0.07130491, 0.15691474, 0.095743366, 0.059467375, -0.009469478));
			target1 += mul(src[i + 1][j], float3x4(-0.07830576, 0.17082681, -0.013175545, -0.035535168, 0.1573564, -0.16532823, -0.49614525, -0.050994795, 0.027053844, -0.20359018, 0.08111269, 0.018715343));
			target1 += mul(src[i + 1][j + 1], float3x4(-0.060515754, 0.13169582, -0.15245132, -0.045724656, -0.50778043, -0.11690067, -0.08320143, 0.01093529, -0.1954136, 0.07388989, -0.1360143, -0.023233788));
			target1 += float4(-0.0054077627, -0.15644358, 0.06612042, 0.014728144);

			float4 target2 = mul(src[i - 1][j - 1], float3x4(0.12039797, 0.04008252, 0.073074624, -0.11763173, 0.091902, -0.040129073, 0.009170759, -0.10760076, -0.032387916, 0.07807673, -6.477932e-05, 0.040181372));
			target2 += mul(src[i - 1][j], float3x4(0.11585734, -0.0078019407, 0.063107856, 0.09835168, -0.029397847, 0.12520139, 0.078661725, -0.12675259, 0.05563671, -0.058422342, 0.01478436, -0.08239175));
			target2 += mul(src[i - 1][j + 1], float3x4(-0.14991632, -0.0134848505, -0.38663143, -0.10859369, -0.012961176, -0.09099144, -0.19593886, -0.08396245, -0.02676463, -0.0066295485, 0.026225863, -0.014788537));
			target2 += mul(src[i][j - 1], float3x4(0.20830092, -0.07655779, 0.04518565, 0.16015635, 0.5536684, 0.09759215, -0.101477005, -0.5042874, 0.35608026, -0.15335238, 0.0796228, -0.05107349));
			target2 += mul(src[i][j], float3x4(-0.3968312, 0.091579735, -0.085623756, -0.16367513, -0.48967922, -0.08557767, 0.34444988, 1.0766053, -0.2158915, 0.22244956, -0.033804208, 0.25898156));
			target2 += mul(src[i][j + 1], float3x4(0.026278365, 0.1678205, -0.38981274, 0.16932413, -0.0146527905, 0.65131354, -0.08886725, 0.07017853, 0.012986331, 0.029087646, -0.034898676, 0.040925268));
			target2 += mul(src[i + 1][j - 1], float3x4(0.07614263, 0.015624113, -0.054716587, -0.04951801, 0.016731577, -0.08749623, 0.07815944, -0.015630174, -0.08723511, 0.077201165, -0.12674089, -0.08418575));
			target2 += mul(src[i + 1][j], float3x4(-0.07938198, -0.4056014, 0.26949093, 0.067213245, -0.47538033, -0.7786735, -0.25821188, 0.029314762, -0.1799233, -0.31132734, 0.17210929, -0.018579468));
			target2 += mul(src[i + 1][j + 1], float3x4(0.06732179, 0.18373649, -0.00039802346, 0.006567155, 0.36993378, 0.20254396, 0.12314272, -0.07356931, 0.12402926, 0.122744165, -0.07482636, 0.011531308));
			target2 += float4(-0.001029383, 0.0058537456, 0.38202196, -0.028818231);

			tex1[destPos] = target1;
			tex2[destPos] = target2;
		}
	}
}


//!PASS 2
//!DESC Conv-4x3x3x16
//!IN tex1, tex2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass2(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	float4 a1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e1 = tex1.SampleLevel(sam, pos, 0);
	float4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na1 = max(-a1, 0);
	float4 nb1 = max(-b1, 0);
	float4 nc1 = max(-c1, 0);
	float4 nd1 = max(-d1, 0);
	float4 ne1 = max(-e1, 0);
	float4 nf1 = max(-f1, 0);
	float4 ng1 = max(-g1, 0);
	float4 nh1 = max(-h1, 0);
	float4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	float4 a2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e2 = tex2.SampleLevel(sam, pos, 0);
	float4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na2 = max(-a2, 0);
	float4 nb2 = max(-b2, 0);
	float4 nc2 = max(-c2, 0);
	float4 nd2 = max(-d2, 0);
	float4 ne2 = max(-e2, 0);
	float4 nf2 = max(-f2, 0);
	float4 ng2 = max(-g2, 0);
	float4 nh2 = max(-h2, 0);
	float4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	float4 target1 = mul(a1, float4x4(0.29632062, 0.029882489, -0.14098296, -0.18291497, 0.2815667, -0.053558454, 0.10369031, -0.080831036, 0.06955536, -0.020830285, -0.2950748, -0.12086926, -0.023612294, -0.08531449, 0.0390424, 0.011767062));
	target1 += mul(b1, float4x4(0.094862625, 0.10448947, -0.11741873, 0.18871774, 0.15733057, 0.029140847, 0.16012338, 0.24601823, -0.2918398, 0.09956984, 0.04739516, 0.24494317, -0.021442452, 0.014309354, -0.04935703, 0.13617574));
	target1 += mul(c1, float4x4(0.1303212, -0.06284708, -0.23948738, -0.05782472, -0.024363542, -0.11251477, -0.07354371, -0.023819327, 0.13228065, 0.05485702, -0.068491034, 0.1938021, -0.046649717, -0.07752723, 0.031408865, 0.026424367));
	target1 += mul(d1, float4x4(0.4484274, 0.10068111, -0.122855306, -0.0649749, 0.09444103, 0.119198084, -0.22159275, 0.08555494, -0.18764949, -0.16276829, 0.20377621, 0.13764401, -0.027324807, 0.1294057, 0.035147414, 0.010751216));
	target1 += mul(e1, float4x4(-0.23321865, -0.029401073, 0.064655, 0.28420436, 0.051707894, -0.27915454, 0.0037779005, -0.18335588, 0.2275033, -0.25828046, -0.005339017, -0.4358691, 0.30487007, -0.044198595, 0.08767978, -0.04610031));
	target1 += mul(f1, float4x4(-0.044556934, -0.06692216, -0.21124081, -0.1394273, -0.24693833, -0.11292927, -0.026020816, 0.08460799, -0.11183761, -0.020012703, 0.29486617, 0.3726646, -0.054298244, 0.0045891847, -0.23693855, -0.02848257));
	target1 += mul(g1, float4x4(0.04877764, -0.011964396, -0.08024895, -0.12175299, -0.015131821, -0.048996765, 0.10887477, -0.053192, 0.09514728, 0.00918332, -0.08621957, -0.04674572, -0.06000914, -0.060330987, 0.20702218, -0.015094036));
	target1 += mul(h1, float4x4(-0.1651207, -0.024862131, 0.37494823, 0.10404018, -0.2891288, -0.38082635, 0.3194514, -0.055852838, -0.073994, -0.1566444, 0.08473525, 0.15672058, -0.08205729, 0.33623922, 0.2524126, 0.022668727));
	target1 += mul(i1, float4x4(0.083043605, -0.2989435, -0.14887308, 0.008989406, 0.01214123, -0.29387334, 0.015363028, -0.027485376, 0.13046521, 0.12814927, -0.05615426, -0.08612421, -0.3833321, 0.30930206, 0.3397905, 0.095533006));
	target1 += mul(a2, float4x4(-0.24895187, 0.033672538, 0.024642626, 0.049984645, 0.09217451, 0.14135426, -0.36297384, 0.10239864, 0.14611131, -0.022452299, 0.009058074, -0.18909958, -0.0049156225, 0.10530887, -0.06942197, -0.06450979));
	target1 += mul(b2, float4x4(0.14025575, 0.030043123, 0.07131405, -0.0675702, 0.17274232, 0.17896765, -0.4741036, 0.35670453, 0.03911085, 0.079387054, 0.04420335, -0.040182803, 0.078871965, 0.16641952, -0.101532914, -0.07683543));
	target1 += mul(c2, float4x4(0.09527535, -0.014889733, 0.003928801, -0.0144131305, 0.16238941, -0.073573746, -0.18349837, 0.057071213, -0.056580186, -0.048254594, -0.02551881, 0.13056543, 0.15681924, 0.11329136, 0.026496707, -0.15252273));
	target1 += mul(d2, float4x4(-0.20110673, -0.17727304, 0.16747122, -0.060028642, -0.03426759, 0.34928, -0.16542526, -0.03665969, -0.11838656, 0.050624777, -0.11309917, 0.17241085, 0.07320429, -0.19349192, 0.36426768, -0.09952723));
	target1 += mul(e2, float4x4(-0.44797856, 0.13166381, 0.19358242, -0.17328072, 0.277866, 0.2704677, -0.3897292, -0.09388379, -0.11871803, 0.059352446, 0.02593061, 0.1112078, -0.44574997, -0.20000082, -0.06038736, 0.11825317));
	target1 += mul(f2, float4x4(0.035831098, 0.03203473, 0.007270905, -0.2803515, -0.07803025, -0.044999845, 0.05503914, 0.02849373, -0.019534718, 0.109557584, -0.057618562, -0.22376212, 0.037630744, 0.07285683, -0.10327242, 0.15022281));
	target1 += mul(g2, float4x4(-0.22891119, -0.07055901, -0.06924297, 0.091851085, 0.079149276, 0.072282575, 0.1054971, -0.055115294, 0.15652373, -0.18508117, -0.044487055, -0.09513059, -0.007512581, 0.04595113, -0.16729085, -0.038352273));
	target1 += mul(h2, float4x4(-0.33190495, 0.024579005, 0.3356564, -0.24311046, -0.12634008, 0.08820384, -0.028739989, 0.0871991, -0.011547642, 0.07184339, 0.098442815, 0.13700496, -0.0665544, 0.18513693, 0.0956979, -0.15374076));
	target1 += mul(i2, float4x4(0.18756114, -0.21920492, 0.06407122, -0.24816798, -0.061943263, -0.054488074, -0.081633896, 0.0920547, -0.08880129, -0.18094197, 0.006789256, 0.094598, 0.17799775, -0.26764068, -0.02588948, 0.17095666));
	target1 += mul(na1, float4x4(-0.17086025, -0.035769157, -0.11523461, 0.20404252, -0.0398533, -0.19902702, 0.0072308285, -0.099984154, -0.1378846, -0.14236672, 0.40502122, 0.13498029, 0.057273205, -0.06250679, 0.035283897, 0.01703995));
	target1 += mul(nb1, float4x4(0.12786144, -0.23442458, -0.047862124, -0.12505807, -0.23584808, -0.064344, 0.07848756, 0.08764069, 0.34468293, -0.29927257, -0.18664125, -0.20938, 0.07196786, -0.0116679, 0.16299239, -0.076479614));
	target1 += mul(nc1, float4x4(0.08299449, -0.065809734, 0.12079292, -0.14469329, -0.0102624465, 0.04364353, 0.080220595, -0.025435027, 0.010015513, -0.03863331, 0.28750968, -0.07637218, 0.049731467, -0.062238537, 0.04295189, -0.028037619));
	target1 += mul(nd1, float4x4(-0.3443906, -0.04198392, -0.1612832, 0.2218389, -0.01519015, 0.093942635, -0.15996122, -0.00093763386, 0.024008257, 0.035953972, -0.15771267, -0.07148778, 0.044501238, -0.14950006, 0.087219894, -0.05831199));
	target1 += mul(ne1, float4x4(0.22099696, 0.117263354, -0.09403858, -0.20318906, 0.18389364, 0.13026148, 0.08169665, -0.014347075, -0.16660765, 0.026503418, 0.058967166, 0.26847002, -0.39771244, 0.36704144, 0.18202503, 0.15385705));
	target1 += mul(nf1, float4x4(-0.015089774, 0.120200686, 0.114723265, 0.12578037, 0.16593805, 0.040399004, 0.11654924, 0.0025250788, -0.2342627, 0.05865, -0.16680475, -0.06301523, 0.07027856, 0.18853764, 0.5213158, -0.3325365));
	target1 += mul(ng1, float4x4(-0.046818264, 0.04336355, 0.020795586, 0.09917639, -0.028305013, -0.14843199, 0.16168857, -0.18567303, 0.010514865, 0.06331522, 0.064430796, 0.034681283, 0.041065965, 0.06522019, 0.055882502, -0.015027999));
	target1 += mul(nh1, float4x4(0.18064371, -0.17903666, -0.24589087, 0.072444126, 0.13745096, -0.011904026, -0.19597653, 0.023044739, 0.01585197, -0.1759934, 0.2287553, -0.20533411, -0.2898527, 0.42946577, -0.07443011, 0.052736074));
	target1 += mul(ni1, float4x4(0.043961097, 0.10761276, -0.0034299751, 0.10491056, 0.004527805, 0.077598244, 0.077725716, 0.030323742, -0.20945124, -0.2235839, 0.05397613, 0.2072019, 0.008769854, 0.30513015, -0.104206346, -0.20982127));
	target1 += mul(na2, float4x4(0.0858221, -0.013858144, 0.11189161, -0.005996965, 0.13058044, 0.23954146, -0.29756296, 0.1801562, -0.034966737, 0.16073282, -0.06920962, 0.16596314, -0.06746436, 0.18948093, -0.17374122, 0.035474796));
	target1 += mul(nb2, float4x4(-0.23172139, -0.062111232, 0.028210253, -0.022090558, 0.2854972, 0.015097004, -0.2057132, -0.19835956, 0.14960685, 0.09755452, -0.17077617, -0.10259408, -0.11695073, 0.071428165, 0.18146773, 0.22237262));
	target1 += mul(nc2, float4x4(-0.008571818, 0.2160462, 0.20442474, 0.008256999, -0.0054347264, -0.036963176, -0.023052184, -0.27406418, 0.0155730015, 0.13230084, 0.028925262, -0.22739749, -0.07608481, -0.059095357, -0.17677523, 0.020464322));
	target1 += mul(nd2, float4x4(0.09186881, 0.10612606, -0.29759738, 0.04590405, 0.042567376, 0.77174157, -0.3156031, 0.22815126, -0.0745266, -0.08352131, -0.03825007, -0.14752272, 0.14866033, -0.07303083, 0.01180863, 0.12170875));
	target1 += mul(ne2, float4x4(-0.033268124, -0.08281718, 0.040441882, -0.05514993, 0.07650108, 0.5294327, -0.22229794, 0.049523506, 0.021545604, 0.03359928, 0.032059934, -0.0082821995, 0.036441952, -0.5097175, 0.38215104, 0.33496317));
	target1 += mul(nf2, float4x4(-0.18900043, 0.3539563, 0.25134736, 0.21487151, 0.22043267, 0.13304482, -0.17353508, -0.15632114, -0.067851126, -0.08173108, 0.0035601144, 0.16791934, -0.07563204, -0.07128694, 0.10757592, 0.26915342));
	target1 += mul(ng2, float4x4(0.1761959, 0.025833737, 0.09164757, -0.030622564, -0.22870748, 0.11929823, -0.24930833, 0.07446655, -0.17372188, -0.66051316, 0.15344264, 0.20571554, 0.0620721, 0.08122408, -0.0952373, 0.04144613));
	target1 += mul(nh2, float4x4(-0.053072393, -0.15877937, -0.065579735, 0.09932804, 0.06803561, 0.11719737, -0.0639249, -0.14433555, 0.1210262, 0.057197437, -0.19896421, 0.2387559, 0.35206345, -0.36237878, -0.12187686, 0.24411117));
	target1 += mul(ni2, float4x4(-0.44384086, -0.063816145, 0.14313193, 0.042119484, 0.13939157, 0.02742546, -0.010837158, -0.11953808, 0.083049394, 0.22100104, 0.12818006, -0.27883413, -0.22672728, 0.28484213, 0.053133663, -0.2364556));
	target1 += float4(-0.021096209, -0.016017085, 0.009604813, -0.023907887);
	
	float4 target2 = mul(a1, float4x4(-0.0028523596, -0.03346185, 0.30194005, 0.26316985, -0.108354695, 0.19872186, -0.14200853, 0.28833616, 0.009908957, 0.049258288, 0.18224198, -0.10260487, -0.011250263, -0.027236924, -0.03667901, 0.035038423));
	target2 += mul(b1, float4x4(-0.10311966, -0.07219459, 0.13850203, -0.07606934, 0.16186213, 0.20828444, -0.113131486, 0.21757083, -0.14668499, -0.015719058, -0.24257174, 0.33343476, 0.042950086, 0.014013289, -0.095338345, -0.10305408));
	target2 += mul(c1, float4x4(-0.054802764, -0.03977167, 0.058000274, 0.06520257, -0.06657145, 0.09240672, 0.18538313, -0.057592865, 0.11657067, 0.1435708, -0.10273095, -0.12743765, 0.031955425, -0.032504886, -0.043181136, -0.033146795));
	target2 += mul(d1, float4x4(0.17017461, 0.023741657, 0.12772968, 0.054186005, -0.06654799, 0.13280976, -0.08608028, -0.18274061, -0.14409089, -0.07336282, -0.21611233, 0.3030394, -0.22331606, -0.04621103, 0.008330136, 0.13817237));
	target2 += mul(e1, float4x4(0.12490482, -0.15037048, -0.308773, 0.015911192, -0.23533738, -0.3296806, -0.16643348, -0.18637176, 0.3725922, 0.22720487, 0.101118185, -0.1130553, 0.14583476, 0.25240546, 0.07148758, -0.023047412));
	target2 += mul(f1, float4x4(-0.21086755, 0.10873368, 0.23762812, -0.12715688, 0.07081291, -0.13914634, 0.23084821, -0.027778924, -0.011987815, -0.27076182, -0.037216157, -0.028230239, -0.055693954, -0.1835301, -0.19994727, -0.05208119));
	target2 += mul(g1, float4x4(0.0063535348, -0.047104187, 0.16425404, -0.003719743, -0.048529398, -0.17317753, -0.048431315, 0.12659106, -0.10488035, 0.14231968, -0.042716738, -0.20889871, -0.17258343, -0.1703121, -0.08134935, -0.08982632));
	target2 += mul(h1, float4x4(-0.047956906, 0.02809404, -0.23035078, 0.017211597, 0.056016855, -0.16766964, 0.024528379, 0.15064329, -0.20351112, -0.16514936, -0.013601919, 0.056769293, -0.4962774, -0.315249, -0.11879433, 0.24711494));
	target2 += mul(i1, float4x4(0.11659998, 0.23360188, -0.037019785, 0.01478414, -0.0056206854, -0.058505666, 0.05248197, 0.048921894, 0.014286839, -0.05282425, 0.12870628, 0.12335835, -0.2298834, -0.34117943, 0.07997886, 0.026714392));
	target2 += mul(a2, float4x4(-0.061862264, -0.024006715, 0.1239327, -0.06400159, -0.1736359, -0.13043079, -0.047287256, 0.092979684, -0.052144498, 0.046599787, 0.03391796, 0.10504723, 0.12802011, 0.05123276, 0.15974, 0.052002482));
	target2 += mul(b2, float4x4(0.22270438, -0.15555483, 0.10334453, -0.033303298, -0.13101499, 0.37030843, -0.6013731, 0.22835553, 0.17596579, -0.0012125646, 0.000731955, -0.03726906, -0.033959538, 0.10808315, 0.043731548, 0.12308547));
	target2 += mul(c2, float4x4(-0.02969512, 0.029882276, -0.04630252, -0.029674841, 0.15309821, 0.35859957, -0.15453109, 0.12228518, -0.062312573, -0.030289005, 0.10737798, 0.01659783, -0.07855408, 0.090425, -0.07134876, 0.10678761));
	target2 += mul(d2, float4x4(0.068100914, -0.16862309, 0.2094642, -0.02229975, -0.1045019, -0.043305863, 0.18030378, -0.085608065, -0.13514027, 0.11298315, -0.009570404, -0.03180812, 0.19233464, -0.058786966, -0.06211013, 0.1411384));
	target2 += mul(e2, float4x4(0.025985425, 0.08034115, 0.14352584, 0.039992746, -0.034363434, 0.27888498, -0.21407174, -0.15813926, -0.10999899, -0.15183197, -0.23127002, 0.10193841, 0.029217485, 0.069463246, 0.093416885, -0.17118438));
	target2 += mul(f2, float4x4(-0.34840858, 0.18985972, 0.1336286, 0.11525281, -0.12827359, -0.14426456, 0.13312757, -0.06653705, 0.050836112, 0.06430556, 0.14759327, 0.043341596, 0.17556888, -0.14065917, 0.046736937, -0.01075016));
	target2 += mul(g2, float4x4(0.13843128, -0.07928894, 0.09180436, -0.17497942, -0.0075141867, -0.13388832, 0.07851216, 0.16109377, 0.190535, -0.05877419, -0.010671232, 0.056779247, 0.005048462, 0.084110186, 0.053035934, -0.18244119));
	target2 += mul(h2, float4x4(0.2849969, -0.045472432, -0.007433944, 0.1180168, 0.060636494, 0.13299662, -0.14055575, 0.045158602, -0.08960926, 0.013875276, -0.01840017, -0.06518111, -0.062131494, 0.13034189, -0.086812675, -0.302895));
	target2 += mul(i2, float4x4(0.116545, 0.19018339, 0.015378095, 0.11349383, 0.097009905, 0.08176944, 0.033653412, -0.08137759, 0.03190533, 0.0014444767, -0.110253245, -0.075649016, -0.19349468, -0.22105742, -0.026897507, 0.11604942));
	target2 += mul(na1, float4x4(-0.1504224, -0.17498577, -0.111215115, -0.18127528, 0.01749777, 0.02000031, 0.10482995, 0.0281041, 0.14524435, 0.032341465, -0.1431526, 0.035199255, 0.07964384, 0.12253888, -0.04288506, 0.08354433));
	target2 += mul(nb1, float4x4(0.12756018, -0.09853538, -0.14765038, -0.14513035, -0.1674001, 0.041217312, 0.1293034, -0.14122911, 0.22340834, 0.026480576, 0.30324772, -0.14868121, -0.007565819, -0.118748344, 0.13285564, 0.00254272));
	target2 += mul(nc1, float4x4(0.049038395, -0.053402808, -0.20165808, -0.023086373, 0.11763773, -0.06335101, -0.033782937, 0.04164607, -0.18420623, -0.19549052, 0.046879902, 0.009774202, 0.01943834, 0.0056786705, -0.076153725, 0.02122122));
	target2 += mul(nd1, float4x4(-0.26325268, -0.031591017, -0.009897877, 0.1264021, 0.17001875, -0.0703141, 0.09623203, -0.3370359, 0.18430787, 0.27577603, 0.24643779, -0.0070356624, 0.14593714, -0.0370321, -0.09890827, -0.2308416));
	target2 += mul(ne1, float4x4(-0.07015072, 0.096860155, 0.237449, 0.02297801, 0.055075265, 0.27420217, 0.020077437, 0.18272734, -0.33312458, 0.10344985, -0.2324171, 0.065973476, -0.20755117, -0.27599007, -0.5285744, -0.4155286));
	target2 += mul(nf1, float4x4(0.07746828, -0.05007699, -0.0768586, 0.20854229, -0.03390625, 0.030966418, -0.019005757, -0.044068232, 0.107805826, 0.27990255, 0.20861949, -0.13380727, 0.11099171, -0.24434194, -0.44939178, 0.09977314));
	target2 += mul(ng1, float4x4(-0.025078122, 0.1578438, -0.087854326, 0.20194288, 0.09858984, 0.05664248, 0.007542862, -0.08403176, 0.1984592, -0.18137619, 0.008725761, 0.1726853, -0.05140944, 0.011235891, -0.043068934, 0.0055390405));
	target2 += mul(nh1, float4x4(0.11204605, 0.12672849, 0.21329713, 0.022434214, 0.18346484, 0.15253071, -0.07683229, -0.023870528, 0.15187134, 0.16080903, -0.20011483, -0.055256322, 0.2351501, -0.10940274, 0.3296904, -0.08561938));
	target2 += mul(ni1, float4x4(-0.116775244, -0.034662195, 0.09385859, 0.015938438, 0.04321025, 0.07417467, -0.020254206, -0.021971289, 0.082221195, -0.17118677, -0.0803086, -0.04383127, -0.040375095, 0.19555633, -0.02246454, -0.039880734));
	target2 += mul(na2, float4x4(0.091371894, 0.07079164, -0.061235007, -0.074043915, -0.07527448, 0.20644194, -0.025892835, 0.288527, -0.0681896, -0.18799694, -0.02397431, -0.20931835, -0.23740639, -0.21364078, -0.066721395, 0.046312522));
	target2 += mul(nb2, float4x4(0.028192231, -0.076668344, -0.048378978, 0.08975191, 0.11627764, 0.1206538, 0.13604914, -0.32365093, -0.3457084, 0.038705852, -0.11968679, -0.03261415, -0.08069945, 0.06448551, -0.04925646, -0.112106614));
	target2 += mul(nc2, float4x4(0.02704155, -0.38007632, -0.12648691, 0.0065948786, -0.107972704, -0.07776103, -0.09248745, -0.034419768, 0.08890347, 0.005535876, -0.17764446, 0.01000324, 0.09786164, 0.0046856827, 0.071132, -0.07665281));
	target2 += mul(nd2, float4x4(0.084308736, 0.12159227, 0.09618809, -0.095442675, -0.19964129, 0.12708808, 0.13591234, -0.08824806, 0.13553478, 0.116799936, 0.2792844, 0.3961157, -0.21412137, 0.35075518, -0.4579121, 0.13404456));
	target2 += mul(ne2, float4x4(0.14321525, -0.2190369, -0.0017736118, 0.04966717, -0.21058443, 0.03795149, 0.024490742, 0.16852312, 0.23549259, 0.098397486, 0.39796385, -0.17331013, 0.57935876, 0.14168213, 0.10026468, 0.12272344));
	target2 += mul(nf2, float4x4(0.23204985, -0.13793765, -0.09148106, -0.24824184, 0.03669933, 0.40166143, -0.26450562, 0.1297136, -0.0033718192, -0.14882618, -0.027010696, 0.15510502, -0.13116877, 0.13243876, 0.29493085, -0.159246));
	target2 += mul(ng2, float4x4(0.14412704, -0.043252222, 0.038036022, 0.21720204, 0.16157351, 0.25108346, 0.05750981, -0.11665398, 0.15858066, 0.13024889, -0.30726764, -0.017599456, -0.13654993, 0.16810633, -0.22028227, 0.054728623));
	target2 += mul(nh2, float4x4(0.04761309, -0.1425047, 0.121673144, -0.0067104516, 0.13617267, 0.11343255, 0.14773287, -0.111807466, 0.29447448, 0.043634728, 0.040583152, -0.22446026, -0.17453341, -0.1746306, -0.27435815, 0.1823859));
	target2 += mul(ni2, float4x4(0.0914674, -0.1296898, 0.081237026, -0.03182429, -0.082685776, 0.08469174, -0.014075701, 0.043630067, -0.02028655, 0.043165345, 0.14293486, 0.03086512, 0.16910627, 0.32537475, -0.022409504, -0.15651123));
	target2 += float4(-0.010434646, -0.007589305, 0.03614506, 0.017320616);

	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 3
//!DESC Conv-4x3x3x16
//!IN tex3, tex4
//!OUT tex1, tex2
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass3(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	float4 a1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e1 = tex3.SampleLevel(sam, pos, 0);
	float4 f1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na1 = max(-a1, 0);
	float4 nb1 = max(-b1, 0);
	float4 nc1 = max(-c1, 0);
	float4 nd1 = max(-d1, 0);
	float4 ne1 = max(-e1, 0);
	float4 nf1 = max(-f1, 0);
	float4 ng1 = max(-g1, 0);
	float4 nh1 = max(-h1, 0);
	float4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	float4 a2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e2 = tex4.SampleLevel(sam, pos, 0);
	float4 f2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na2 = max(-a2, 0);
	float4 nb2 = max(-b2, 0);
	float4 nc2 = max(-c2, 0);
	float4 nd2 = max(-d2, 0);
	float4 ne2 = max(-e2, 0);
	float4 nf2 = max(-f2, 0);
	float4 ng2 = max(-g2, 0);
	float4 nh2 = max(-h2, 0);
	float4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	float4 target1 = mul(a1 , float4x4(-0.13837793, -0.05485083, -0.11455316, -0.23340753, -0.0019060506, 0.014073009, -0.006939684, 0.013028122, 0.0070730825, 0.07576272, -0.046580516, -0.10440529, -0.14514132, 0.118322305, 0.15771718, -0.10368211));
	target1 += mul(b1 , float4x4(0.0033887655, 0.105515614, -0.022994144, 0.20720185, 0.014884114, 0.04068789, 0.14153919, -0.20556916, -0.106429465, 0.15578473, 0.17493904, 0.27673894, 0.068228535, -0.17600566, 0.117449455, 0.1464803));
	target1 += mul(c1 , float4x4(-0.13132714, 0.057890475, -0.15540479, -0.05904855, 0.10353088, -0.014439668, 0.10086415, -0.035871122, 0.15052663, -0.017041713, 0.039761867, -0.20937371, 0.034573525, -0.10553617, 0.054362305, -0.1131222));
	target1 += mul(d1 , float4x4(0.043574177, -0.12957309, 0.06257102, 0.059841774, 0.025344899, -0.08974694, -0.02106646, -0.083086155, 0.044729084, 0.07391884, 0.026620472, 0.029281206, 0.0439835, -0.040075477, 0.20127645, 0.08995277));
	target1 += mul(e1 , float4x4(0.4999535, -0.16519144, -0.082569085, 0.08618579, 0.20361039, -0.16703975, 0.013153, 0.19670624, 0.2853662, -0.11533776, -0.109247334, 0.06345074, 0.0635587, 0.104548655, 0.31481677, 0.2264137));
	target1 += mul(f1 , float4x4(0.029020509, 0.24992752, 0.5356852, 0.16753946, -0.057788625, 0.02481026, 0.2059601, 0.16616577, -0.078577444, 0.307072, -0.08813778, 0.09565001, 0.02336261, -0.1928066, -0.0876608, -0.15015014));
	target1 += mul(g1 , float4x4(0.079325944, 0.062302455, -0.10960886, -0.0025239969, 0.02924214, 0.026142644, 0.0821952, 0.008034841, 0.121127814, 0.109858714, -0.053383715, 0.22161359, 0.03572371, 0.013655175, -0.27442926, 0.038870957));
	target1 += mul(h1 , float4x4(-0.057337154, 0.03110442, 0.07781571, 0.17697155, 0.10287903, 0.10192227, -0.036485553, -0.07374302, 0.081604324, 0.1566476, 0.23592737, 0.09747013, -0.095862515, -0.1456753, 0.057745866, -0.17291927));
	target1 += mul(i1 , float4x4(-0.070838295, 0.15307118, -0.16363588, 0.08716062, -0.012117197, 0.09717453, -0.057428412, 0.019387208, -0.011734471, 0.28442237, -0.0138647, 0.1619097, 0.046210334, -0.06327717, 0.035489302, -0.055681925));
	target1 += mul(a2 , float4x4(0.03013589, -0.16602181, 0.00417107, -0.022452664, 0.03615902, 0.09972378, -0.043409757, -0.028448848, -0.055032104, 0.08841848, -0.12667881, -0.06865927, -0.020555131, -0.09292043, 0.03676055, 0.1469838));
	target1 += mul(b2 , float4x4(-0.03413294, 0.1853497, -0.21196875, -0.2122292, -0.046451584, 0.33640862, 0.11074589, 0.0038315703, -0.32845765, -0.018679257, -0.24653831, 0.15887207, 0.018353334, -0.32828686, 0.070055164, 0.22649562));
	target1 += mul(c2 , float4x4(-0.11916958, -0.11415976, 0.10442459, 0.026471134, 0.15098315, 0.030927312, 0.08758737, 0.04333666, -0.40113255, -0.22881064, -0.18026584, 0.010727328, 0.04185913, -0.033592816, 0.019705769, -0.09931264));
	target1 += mul(d2 , float4x4(-0.024548884, 0.070657834, 0.04315053, -0.09292352, -0.023722034, -0.06946775, -0.017095754, 0.14835307, -0.030104019, 0.13317905, 0.052319784, 0.18717423, 0.024163181, 0.04198584, 0.024550296, 0.09124823));
	target1 += mul(e2 , float4x4(0.36571705, 0.28200173, -0.031241365, 0.32836193, 0.15685938, 0.14499408, -0.03710283, -0.105348445, 0.2111411, -0.14371765, 0.22234774, 0.12241296, 0.12288187, -0.29133305, -0.23992771, 0.102442585));
	target1 += mul(f2 , float4x4(-0.30764952, -0.38244587, -0.021552043, 0.02371933, -0.043024007, -0.09969857, -0.08754052, -0.05584622, 0.32374346, 0.21340463, 0.110945925, -0.19904156, -0.1425847, -0.16605812, -0.14231746, -0.14958249));
	target1 += mul(g2 , float4x4(0.07996341, 0.036993798, 0.013508032, 0.015037227, -0.05188829, -0.08953449, 0.024801137, 0.20462893, -0.013199976, -0.19138044, -0.056084607, 0.14518543, -0.029846137, 0.033994604, 0.006321045, -0.19919403));
	target1 += mul(h2 , float4x4(0.09230355, 0.17356168, -0.057000663, -0.10861382, 0.038658872, 0.01218888, 0.19109936, 0.021123217, 0.042241503, 0.008514072, 0.031998653, 0.014464009, -0.036506813, 0.27310863, -0.052563597, -0.21396813));
	target1 += mul(i2 , float4x4(-0.036255296, -0.11269877, -0.07659167, -0.09690911, -0.027218975, 0.105712906, 0.05750272, 0.14222133, 0.016676722, 0.0962918, 0.05514509, -0.023851087, -0.07117767, -0.020666543, -0.23072378, -0.043072045));
	target1 += mul(na1 , float4x4(0.1542571, -0.13077767, -0.0881844, -0.00536349, 0.019526271, -0.1529528, -0.11660246, -0.1698449, 0.019345785, -0.039716557, 0.14421196, 0.08003151, 0.15334103, -0.16667187, 0.006088769, 0.13378714));
	target1 += mul(nb1 , float4x4(-0.08746076, 0.060209457, 0.051582757, 0.02863364, 0.1454257, 0.42194176, 0.09892967, 0.26043537, -0.06934357, -0.1020657, 0.23833197, 0.15991127, 0.09294198, 0.017690487, 0.11748737, -0.2849694));
	target1 += mul(nc1 , float4x4(0.12285264, 0.073884204, -0.027040116, 0.03438263, -0.060739577, 0.17927702, 0.16900496, 0.3545027, 0.1545223, -0.09951323, 0.42339948, 0.14226453, 0.10644413, 0.15645456, -0.03346077, -0.009488195));
	target1 += mul(nd1 , float4x4(-0.080912456, 0.16929491, 0.027275667, -0.020797532, 0.05746718, -0.071174294, 0.3193612, 0.055932105, 0.031726856, -0.03390961, 0.13757136, -0.017296424, -0.041106436, 0.02487556, -0.29788992, -0.29300368));
	target1 += mul(ne1 , float4x4(-0.32740706, 0.4221705, 0.35447162, 0.13970987, 0.07307587, 0.65598255, 0.7267268, 0.35669217, -0.24410655, 0.30564576, -0.033510603, 0.20394838, -0.012135275, -0.12212605, 0.00741055, -0.12938774));
	target1 += mul(nf1 , float4x4(0.17577599, -0.075835876, -0.06821395, -0.19289997, 0.048764437, 0.11093425, 0.15844633, 0.21540429, -0.14261006, -0.12678951, -0.05380409, 0.21502183, -0.053737447, -0.23268248, 0.077271536, -0.11794149));
	target1 += mul(ng1 , float4x4(-0.06283879, -0.11581014, -0.0077474653, -0.051150266, 0.017263902, -0.12403667, 0.13689952, -0.13955206, -0.036969677, 0.04593233, 0.31484202, -0.021023672, 0.006109164, -0.022175733, 0.32699695, -0.26805824));
	target1 += mul(nh1 , float4x4(0.07529928, -0.020912366, -0.14532542, -0.13928838, 0.07875855, -0.18651104, 0.47042093, 0.342289, -0.06575549, -0.13776249, 0.21936299, 0.124723375, 0.05280059, -0.07600857, 0.0027616988, 0.11619774));
	target1 += mul(ni1 , float4x4(0.06760704, -0.11735106, 0.07262433, -0.040624633, 0.35947633, 0.29390943, 0.025136888, -0.12812558, 0.17102966, -0.15462054, 0.37353945, 0.030337518, -0.01959842, -0.07917661, 0.036980435, -0.008516924));
	target1 += mul(na2 , float4x4(-0.07072042, 0.19770962, -0.039348952, 0.06489617, -0.03382829, -0.11054973, -0.035438936, 0.011020459, -0.050599303, -0.07308136, -0.029521624, 0.0694216, 0.021597218, 0.07275136, 0.1196986, -0.021191286));
	target1 += mul(nb2 , float4x4(0.027155813, -0.024280313, -0.1322834, 0.219577, 0.013412778, 0.027934693, -0.10113296, 0.16649272, 0.029343246, 0.08333487, -0.09067474, -0.06318277, 0.016611677, 0.22737436, 0.11019619, -0.11105013));
	target1 += mul(nc2 , float4x4(-0.10233781, 0.04936236, -0.12536384, 0.1270058, -0.07842266, 0.0018531404, 0.021235077, -0.13014361, -0.06192502, -0.07054751, -0.05475905, 0.053059015, 0.15269022, 0.11485296, -0.09188326, -0.13495958));
	target1 += mul(nd2 , float4x4(-0.038748156, 0.17470013, 0.008070423, 0.47245374, 0.14041074, 0.0029743444, 0.09280988, -0.16300924, 0.025588343, 0.042092193, 0.021749513, -0.07912978, -0.08887605, -0.06223286, -0.017056612, 0.11412155));
	target1 += mul(ne2 , float4x4(-0.62907135, -0.054125126, -0.3091851, -0.2599738, 0.24917828, 0.36420155, 0.07772076, 0.039711658, -0.25157338, -0.023638945, -0.054642107, 0.25950512, -0.26855794, 0.16737756, -0.011335203, -0.383141));
	target1 += mul(nf2 , float4x4(0.019852921, 0.2599611, -0.17794183, 0.086149536, 0.27634904, 0.21865687, -0.047085866, -0.08818839, 0.013813605, 0.21364933, -0.22009525, -0.030338509, -0.1512191, 0.042633552, -0.17577383, 0.0662118));
	target1 += mul(ng2 , float4x4(-0.15513746, -0.07846239, -0.13220623, 0.106471166, 0.05573645, 0.16334842, -0.07945537, -0.19104981, -0.013032576, 0.08952008, 0.055789266, -0.035824023, -0.017594192, -0.04652965, -0.30483812, 0.054347165));
	target1 += mul(nh2 , float4x4(0.054959666, -0.055990525, 0.142193, 0.09890494, 0.047798425, 0.15105279, -0.16933344, -0.08214855, -0.11551477, 0.06605292, -0.20606443, -0.04266445, 0.01709317, -0.097884715, -0.21919689, 0.024738865));
	target1 += mul(ni2 , float4x4(0.009435747, -0.0011143036, 0.08239794, 0.06413721, -0.09412612, -0.07816752, 0.0070877066, -0.054295234, -0.02152047, -0.057394918, -0.02919309, 0.020081067, -0.03086805, -0.056924064, 0.026491363, 0.015115628));
	target1 += float4(0.0012513401, 0.026057906, 0.010539876, 0.009830541);

	float4 target2 = mul(a1 , float4x4(0.057580933, 0.01911187, -0.008719538, -0.017969212, -0.24201718, -0.058988657, -0.0025294814, -0.011815471, -0.10723921, -0.19644037, 0.020027963, 0.035667516, 0.08559372, 0.27885816, 0.064953476, -0.05350714));
	target2 += mul(b1 , float4x4(0.045166798, -0.10751055, 0.053228382, 0.12519331, 0.25117958, 0.06537292, -0.15587787, -0.07206778, 0.06837826, -0.043205425, -0.12318706, -0.15438488, 0.03246428, 0.074852064, 0.062248066, -0.009753593));
	target2 += mul(c1 , float4x4(0.09432274, 0.011490796, -0.04061421, -0.16661306, 0.05983951, -0.020288106, 0.029864697, 0.013547436, 0.021477815, -0.07964464, 0.045970913, 0.16307391, -0.04462305, 0.046968628, 0.029862186, 0.11649774));
	target2 += mul(d1 , float4x4(-0.14416671, 0.12803924, -0.18187119, -0.15840115, -0.10262368, 0.043705128, -0.10072281, -0.06468493, -0.043781534, -0.015372785, -0.16681372, 0.05544772, 0.03745967, -0.30812827, 0.007939141, -0.14296778));
	target2 += mul(e1 , float4x4(-0.10959357, -0.23731256, 0.17797269, 0.1566764, -0.04658083, -0.08226227, 0.065335095, 0.12799698, -0.04347693, 0.07439239, 0.124068074, -0.12433461, 0.39953944, -0.4547675, -0.15020098, 0.029323136));
	target2 += mul(f1 , float4x4(0.008521254, -0.42012635, -0.103437595, -0.14140712, -0.13161388, -0.04296955, -0.03418078, -0.11619488, -0.113280736, -0.04337926, -0.15152383, -0.025833655, -0.037376937, 0.028724195, 0.035038933, 0.07584188));
	target2 += mul(g1 , float4x4(-0.0015208189, 0.02739281, 0.07880385, -0.05264596, 0.0006570586, -0.053559244, -0.060818747, 0.0014127572, -0.12706065, -0.22151639, 0.0050813453, 0.03868026, 0.017800469, 0.08653453, 0.12580872, 0.10975057));
	target2 += mul(h1 , float4x4(-0.016247116, 0.060199317, 0.08806292, -0.0053588278, -0.036467995, 0.14757904, 0.010754306, 0.060899615, -0.1110678, -0.050623085, -0.021320427, -0.1970179, 0.1138183, 0.02913883, 0.011708719, -0.0031000238));
	target2 += mul(i1 , float4x4(-0.07490824, 0.26012638, 0.023562372, -0.04637545, 0.015996248, -0.02646302, 0.0035619289, 0.009863964, 0.004463742, -0.13282667, 0.04664953, -0.07782132, 0.026051573, -0.13182716, -0.03470513, 0.082064465));
	target2 += mul(a2 , float4x4(-0.04953172, 0.055778302, -0.007956572, 0.06453463, -0.112365015, 0.031204617, -0.031123973, 0.13237885, -0.08588153, -0.15947914, 0.25160727, -0.0101198545, -0.09021632, 0.0047894586, -0.008986191, -0.11622616));
	target2 += mul(b2 , float4x4(-0.07868324, -0.03463213, 0.011006695, 0.0033356217, 0.032993857, -0.10459223, -0.11331984, -0.17686851, -0.13263261, -0.22213052, 0.10728409, -0.1332059, -0.0019830016, -0.062202223, -0.12468388, -0.102955));
	target2 += mul(c2 , float4x4(0.06774758, -0.019607622, -0.054707706, -0.032572657, -0.056999106, 0.0063034142, -0.0040720105, 0.04574635, 0.21903323, 0.13402393, -0.07107346, -0.08973124, 0.020255536, -0.021238161, -0.104306765, -0.009237116));
	target2 += mul(d2 , float4x4(0.10643413, -0.116414584, -0.097003944, -0.07626372, -0.28920346, 0.023803055, 0.07691808, 0.015008518, -0.1373578, 0.01935071, -0.07574301, 0.052712873, 0.09657614, -0.02497193, -0.0043158466, -0.09333523));
	target2 += mul(e2 , float4x4(-0.25414145, -0.26243624, 0.12937985, -0.17944777, 0.3664257, -0.06487048, -0.10509681, -0.10630014, 0.075283855, 0.4662916, -0.0014712083, -0.29535007, -0.13022043, 0.25766194, 0.14769283, 0.10512634));
	target2 += mul(f2 , float4x4(0.0689302, 0.022661772, -0.043984957, 0.13596754, -0.06885858, 0.059424106, 0.017230453, 0.08873089, 0.32940426, -0.12071059, -0.08323642, 0.23238598, -0.06291085, 0.09859973, -0.00700876, 0.12986307));
	target2 += mul(g2 , float4x4(-0.060717613, 0.030503884, -0.06323549, 0.0059739, 0.08190414, 0.047006775, -0.0023143853, -0.027005509, -0.17168896, 0.20450558, 0.047933526, -0.03124133, 0.14688456, 0.044616744, -0.07474459, 0.13730273));
	target2 += mul(h2 , float4x4(0.16486372, 0.13952915, 0.1093748, -0.031850196, -0.14090009, -0.120079, 0.023628023, -0.077027865, 0.09811187, -0.033171307, -0.03331771, -0.038893215, 0.011286584, -0.15111947, -0.017341943, 0.0015878569));
	target2 += mul(i2 , float4x4(-0.09481133, 0.107053526, 0.047643233, 0.16805217, -0.0678524, -0.07519125, -0.02258995, -0.13705339, 0.16563395, -0.16972524, 0.04326224, 0.024816778, 0.010601283, -0.041156873, 0.062542215, 0.047571044));
	target2 += mul(na1 , float4x4(0.12500185, -0.006936863, 0.024489427, 0.1740798, -0.036525737, -0.12511401, 0.07990025, 0.02839474, -0.12753619, -0.06010621, 0.08279232, -0.081194244, -0.09039855, -0.18112564, -0.26476932, -0.031485114));
	target2 += mul(nb1 , float4x4(-0.100102335, 0.22857793, 0.14938287, -0.014803003, -0.17217094, -0.62685734, 0.095959224, 0.17851897, 0.06559054, 0.10896349, 0.0067452556, -0.07877991, 0.15820199, -0.19860643, -0.23554341, 0.108216554));
	target2 += mul(nc1 , float4x4(-0.06440183, -0.086768515, 0.06501931, -0.013325654, 0.048092242, -0.2516782, -0.07378936, -0.5093634, -0.21180914, 0.028729467, 0.097722694, -0.10443471, 0.087278105, -0.13554108, -0.07925715, 0.025918096));
	target2 += mul(nd1 , float4x4(0.101379745, -0.098901495, 0.088425554, 0.10312074, -0.06832467, -0.14247051, -0.06577163, 0.038505282, -0.058837283, -0.041290045, 0.024700344, -0.03952513, 0.050091445, 0.20111398, -0.12729187, 0.17162229));
	target2 += mul(ne1 , float4x4(-0.14357474, -0.52516335, -0.28848764, -0.25948864, -0.6469683, -0.25461218, -0.12740892, -0.23631012, -0.14096075, -0.28670883, 0.12026559, -0.17575467, 0.40593022, 0.09236864, 0.11895183, -0.21580887));
	target2 += mul(nf1 , float4x4(-0.027686533, -0.014736693, 0.11776454, 0.104835264, -0.1122669, -0.10067572, 0.054669123, -0.3256272, -0.1618158, 0.24705333, 0.07530265, -0.16693603, 0.11981224, -0.01764311, 0.035309367, 0.18991415));
	target2 += mul(ng1 , float4x4(0.075753845, 0.030512655, -0.033218108, -0.0020751022, -0.059813447, 0.13577273, -0.17669228, -0.015658198, -0.03524086, -0.027248759, 0.011696296, -0.13176118, 0.13976848, -0.11381985, -0.069327734, -0.04551793));
	target2 += mul(nh1 , float4x4(0.025345126, -0.017192554, -0.10062235, 0.19348828, -0.1404843, 0.19161314, -0.266943, -0.30460906, -0.25685784, 0.023311002, -0.21997964, -0.04452797, 0.039271735, -0.0077815196, 0.05758964, 0.08804478));
	target2 += mul(ni1 , float4x4(0.04886391, -0.017406208, -0.038027596, 0.012643386, 0.14007851, 0.0012767792, -0.115759425, 0.097489856, 0.17599659, -0.050711423, -0.084151536, -0.15770845, -0.08287477, 0.120081306, 0.015947923, -0.06668065));
	target2 += mul(na2 , float4x4(-0.10980535, 0.12270783, 0.063907675, -0.07847296, 0.10355225, -0.31747913, -0.10403689, 0.005290646, 0.07667107, -0.10277437, -0.08069292, -0.02559804, -0.047999926, 0.29834297, -0.036717292, -0.05650061));
	target2 += mul(nb2 , float4x4(-0.103821196, -0.009593053, 0.066295676, -0.38370672, -0.037927628, -0.2711836, 0.1377398, -0.08418159, -0.0737972, -0.039782777, 0.13933297, 0.04516865, -0.06268818, 0.337236, -0.121655226, -0.008400626));
	target2 += mul(nc2 , float4x4(-0.15113829, 0.0017028727, -0.02523152, 0.020020628, 0.14301538, -0.20421621, -0.07266804, 0.04835691, -0.03385325, 0.06579219, -0.026479365, -0.037032843, 0.038153037, 0.014210751, -0.03542229, 0.0710242));
	target2 += mul(nd2 , float4x4(0.0054667736, 0.01744876, 0.3065127, 0.049586684, 0.18856415, 0.2730343, -0.2333077, 0.0068653813, 0.3263104, 0.1581569, -0.067741506, -0.10893117, -0.23163976, 0.0029724934, 0.21427019, -0.05729933));
	target2 += mul(ne2 , float4x4(0.2783585, 0.17852917, -0.1389073, 0.1369532, -0.10491301, 0.3753245, -0.2739856, 0.18703647, -0.64889586, 0.06298504, -0.29364008, 0.17944366, 0.09733316, -0.21755181, 0.090409346, -0.022404745));
	target2 += mul(nf2 , float4x4(-0.08643692, 0.043516237, 0.07125337, -0.23520306, 0.042653214, 0.058355685, -0.13027787, -0.0015239809, 0.06168663, 0.04952333, 0.03217504, -0.094814256, 0.25104445, 0.06959146, -0.14522897, -0.034003105));
	target2 += mul(ng2 , float4x4(0.10193368, 0.109207876, 0.06922978, -0.035177775, 0.08234648, -0.24269609, 0.05216447, 0.07194904, 0.08424774, -0.023948545, 0.1292036, -0.16073976, -0.11004149, -0.14011864, 0.05699544, 0.08603814));
	target2 += mul(nh2 , float4x4(-0.159505, 0.011439578, 0.031358175, -0.074699186, 0.16425711, -0.29734048, 0.06415531, 0.09782104, -0.047154855, -0.053923853, 0.13791925, 0.01920221, 0.2510621, 0.011180524, -0.02389365, 0.22188987));
	target2 += mul(ni2 , float4x4(-0.052833233, -0.0011790307, 0.01832988, -0.087143995, 0.16383314, -0.018386772, 0.018473852, 0.022136362, 0.00095872144, 0.059976995, 0.00461632, 0.006194564, -0.05576084, 0.19239509, 0.07017777, -0.0542914));
	target2 += float4(-0.03782637, -0.0035752894, -0.010155095, -0.025359483);

	tex1[gxy] = target1;
	tex2[gxy] = target2;
}


//!PASS 4
//!DESC Conv-4x3x3x16, Depth-to-Space
//!IN INPUT, tex1, tex2
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

void Pass4(uint2 blockStart, uint3 threadId) {
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
	
	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = ((gxy >> 1) + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	float4 a1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e1 = tex1.SampleLevel(sam, pos, 0);
	float4 f1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na1 = max(-a1, 0);
	float4 nb1 = max(-b1, 0);
	float4 nc1 = max(-c1, 0);
	float4 nd1 = max(-d1, 0);
	float4 ne1 = max(-e1, 0);
	float4 nf1 = max(-f1, 0);
	float4 ng1 = max(-g1, 0);
	float4 nh1 = max(-h1, 0);
	float4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	float4 a2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	float4 b2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 c2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 d2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 e2 = tex2.SampleLevel(sam, pos, 0);
	float4 f2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 g2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 h2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 i2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	float4 na2 = max(-a2, 0);
	float4 nb2 = max(-b2, 0);
	float4 nc2 = max(-c2, 0);
	float4 nd2 = max(-d2, 0);
	float4 ne2 = max(-e2, 0);
	float4 nf2 = max(-f2, 0);
	float4 ng2 = max(-g2, 0);
	float4 nh2 = max(-h2, 0);
	float4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	float4 target1 = mul(a1, float4x4(0.02066797, -0.013601862, -0.007048889, -0.010436224, -0.013475746, 0.017484829, 0.003569871, 0.010704422, -0.013622159, 0.0051929723, -0.01672668, -0.011980923, 0.03233822, -0.008870257, 0.024951939, 0.011703474));
	target1 += mul(b1, float4x4(0.09294306, 0.0014346406, 0.08119577, -0.008968148, -0.003100696, 0.026659792, -0.012744048, 0.02888033, -0.032950055, 0.016204342, -0.017369213, 0.022491494, 0.049124617, -0.027378289, 0.02111168, -0.037705775));
	target1 += mul(c1, float4x4(-0.008216952, -0.021160914, 0.040945128, -0.010472813, -0.005768232, 0.0016923469, -0.0061864546, 0.0007168136, -0.014967856, -0.0013775446, -0.03696006, -0.0011331941, 0.006405253, 0.0030893548, 0.052975312, -0.0077081146));
	target1 += mul(d1, float4x4(0.1026977, 0.06607977, -0.025372373, -0.03354179, -0.034359697, -0.046674587, -0.012842571, 0.0041260347, 0.0189941, -0.029389031, 0.028211223, 0.010089593, -0.0057367194, 0.053311557, -0.037211698, 0.0035831304));
	target1 += mul(e1, float4x4(0.21819879, 0.24159485, 0.25479653, 0.22689046, -0.14693534, -0.101709016, -0.12968269, -0.11193918, -0.17564386, -0.09321151, -0.1088182, -0.098540194, 0.016255755, 0.11983511, 0.07260058, 0.11377339));
	target1 += mul(f1, float4x4(-0.0065045636, -0.014916138, 0.08657154, 0.08976987, -0.013105138, -0.014341284, -0.07822386, -0.05141758, -0.008127414, -0.023220202, -0.07729052, -0.05847569, -0.014462153, -0.008247349, -0.029479884, 0.053986717));
	target1 += mul(g1, float4x4(-0.05111642, 0.013267287, -0.010962933, -0.0062592067, 0.031603336, 0.017792836, 0.040512457, 0.021096494, 0.025952626, 0.03858803, 0.009819873, 0.014087486, -0.016874991, -0.021088999, 0.0048008533, -0.014261808));
	target1 += mul(h1, float4x4(-0.054689944, 0.019172559, -0.10202758, 0.01563535, 0.024953881, -0.042438734, 0.03068713, -0.019600231, 0.0026854384, -0.08990359, 0.038635172, -0.0014791996, -0.045267813, -0.06297795, -0.050205074, -0.036708377));
	target1 += mul(i1, float4x4(-0.009820029, -0.007953665, -0.0030171487, 0.025994476, 0.0055633057, 0.0023106632, -0.0011157666, -0.032652337, 0.02231576, 0.0150678335, -0.00035031908, -0.049004823, -0.0048957765, -0.007189859, -0.022652647, -0.039600767));
	target1 += mul(a2, float4x4(-0.025157524, 0.005564745, 0.0077270563, -0.007929144, 0.07835409, -0.03961485, 0.0042581395, -0.012796515, 0.018968092, 0.0065416014, -0.018439304, -0.031132344, -0.008802365, -0.019820224, -0.053818177, 0.019140625));
	target1 += mul(b2, float4x4(0.024769353, -0.016378574, 0.11481205, -0.026075391, -0.028652724, -0.008835214, 0.06427986, -0.04859151, -0.014552956, -0.028410029, 0.111585446, 0.009804185, -0.042241797, 0.0019805022, 0.03361954, -0.017370641));
	target1 += mul(c2, float4x4(0.034053475, 0.038674638, -0.04696516, -0.0045251776, -0.0039383816, -0.0021523088, -0.021408198, -0.0003014931, -0.023644248, 0.020932276, -0.108067356, 0.030942762, -0.026552102, -0.0056460355, 0.008178258, -0.0009253607));
	target1 += mul(d2, float4x4(0.078304745, 0.011302627, -0.036976036, -0.013908656, -0.10690595, 0.016818136, 0.015758326, -0.016121918, -0.037873853, 0.00037048876, -0.009456388, -0.016097274, 0.057774395, 0.15647416, 0.00012419945, -0.085859895));
	target1 += mul(e2, float4x4(-0.2873381, -0.16951321, 0.14959157, 0.36375824, -0.16162755, 0.30172205, -0.28958184, 0.2487593, -0.10533105, 0.42946494, -0.20263031, 0.23740861, 0.023840647, -0.06442918, -0.29647085, -0.12943329));
	target1 += mul(f2, float4x4(0.0029997546, 0.023317888, -0.023340696, -0.09114311, -0.05119816, -0.08453361, -0.041490365, 0.006037165, 0.016185664, -0.08495001, 0.08690409, 0.10635855, -0.0121372985, -0.0061607, -0.03049874, 0.03893408));
	target1 += mul(g2, float4x4(-0.010922993, 0.03413124, -0.011615248, -0.016528865, 0.015621528, 0.08856427, 0.01220382, 0.0052098404, 0.04833281, 0.037831176, 0.026502393, 0.0107578635, 0.029293455, -0.0051649977, 0.007572071, 0.010101437));
	target1 += mul(h2, float4x4(0.0016892543, -0.086751014, -0.0041350387, -0.06005637, 0.041606825, -0.19200447, 0.028292444, -0.017876074, 0.0101423515, -0.08625792, 0.09077659, 0.024676733, -0.046471898, -0.021680003, 0.007921834, -0.079603985));
	target1 += mul(i2, float4x4(0.017057033, -0.008013634, -0.018102577, -0.01345755, 0.0620006, -0.025968201, 0.07138734, -0.09975251, -0.005465781, -0.011184834, -0.0651285, -0.079556055, -0.00055764546, -0.007492052, -0.029603545, -0.07078825));
	target1 += mul(na1, float4x4(-0.10589389, 0.021210285, -0.028930133, 0.016055413, -0.18338472, -0.051082015, 0.1020663, 0.014365114, 0.11688869, -0.039388333, 0.009866559, -0.03795289, 0.013310502, 0.023231732, 0.007659133, -0.0027602138));
	target1 += mul(nb1, float4x4(0.0025395593, 0.0073232944, -0.04352944, 0.019918712, 0.020041449, -0.073162615, 0.096950345, 0.021914015, -0.17350666, 0.12829295, 0.07900105, 0.03115375, -0.012729986, 0.025017815, -0.19016227, 0.03532046));
	target1 += mul(nc1, float4x4(-0.004561337, 0.018127436, -0.03147566, 0.014196702, 0.016791651, -0.0021540376, -0.021546744, -0.006671925, -0.008601794, 0.0384946, -0.16477007, 0.122372, -0.03592093, -0.016040638, 0.025269061, 0.023783052));
	target1 += mul(nd1, float4x4(0.17777547, -0.29313773, 0.15184408, -0.026345825, 0.046505112, -0.2121665, 0.08373203, 0.1717021, -0.028687157, -0.07293457, -0.062076677, 0.056581914, -0.19576493, -0.1566389, 0.11269683, 0.12300568));
	target1 += mul(ne1, float4x4(-0.2722888, -0.23436427, -0.1575821, -0.48185775, 0.38314724, 0.34028763, -0.22216766, 0.007053101, 0.43936196, -0.106232345, 0.3447898, -0.23145236, 0.17801034, 0.107395455, -0.10301275, -0.37671486));
	target1 += mul(nf1, float4x4(0.027849002, 0.005493108, -0.07151169, -0.037706394, -0.030193258, 0.010484296, 0.12785462, 0.013065869, -0.02244137, 0.07738321, 0.1428445, 0.113706514, -0.016278854, -0.042375315, -0.014864632, 0.009536567));
	target1 += mul(ng1, float4x4(0.016663784, 0.05133444, -0.036621124, -0.008492059, 0.006291255, 0.11044035, -0.06309081, -0.069970004, -0.080658376, 0.09237095, -0.034645274, 0.008006193, 0.0027648888, -0.055031255, 0.03726407, 0.04109432));
	target1 += mul(nh1, float4x4(0.020757113, -0.037167564, 0.10940448, 0.10017512, -0.0557746, 0.068767264, 0.004170172, -0.13714421, -0.03749213, 0.06711421, -0.106826246, 0.11498757, 0.04753172, 0.10241908, 0.029435748, 0.07990668));
	target1 += mul(ni1, float4x4(0.006560853, 0.023810847, -0.024475241, -0.058117945, -0.035195846, -0.04271988, 0.049571864, 0.15042038, -0.007620832, -0.025544636, 0.008050735, 0.01056782, 0.003100258, 0.004679245, 0.006587324, 0.018110644));
	target1 += mul(na2, float4x4(-0.02249566, 0.007422409, 0.012279647, 0.010022545, 0.009818794, -0.0038862806, 0.0011564652, -0.0012341562, 0.025019286, -0.0007220492, 0.0124062635, -0.0023235283, 0.007858289, 0.005228454, -0.012827192, -0.007885503));
	target1 += mul(nb2, float4x4(-0.006161589, -0.00088863634, -0.023869669, 0.0018966346, -0.029431801, -0.02086368, -0.0028806294, -0.018974712, -0.061033156, 0.0062700063, -0.013323617, -0.002422788, 0.006092313, 0.023675537, 0.024895974, 0.028143935));
	target1 += mul(nc2, float4x4(-0.01571405, -0.011126691, -0.023274293, -0.011942278, 0.008145339, 0.002989056, -0.009912996, 0.003355017, 0.027048714, -0.0027421801, -0.017191814, 0.011846277, -0.0014130942, -0.007118815, -0.006929838, 0.0016808703));
	target1 += mul(nd2, float4x4(-0.008867632, -0.054621387, 0.00521757, -0.001299046, -0.011991457, 0.01253288, 0.007283377, 0.008240456, -0.0328741, 0.030364394, 0.020916186, 0.046377357, -0.0025963385, 0.01801948, 0.017200526, 0.00594781));
	target1 += mul(ne2, float4x4(0.049747035, -0.019024936, -0.037307423, -0.08635432, 0.06362242, -0.031690367, 0.0065028532, -0.008046128, -0.05511954, -0.10017574, -0.105036125, -0.07040073, 0.17360815, 0.01423494, 0.08344793, 0.034444667));
	target1 += mul(nf2, float4x4(0.017059349, 0.018542623, 0.08186985, 0.032335408, 0.025869634, 0.05591529, 0.06690499, 0.03575357, -0.012014303, 0.0105476575, 5.6129655e-05, -0.048691276, 0.018272987, 0.006864036, 0.07919666, 0.011271695));
	target1 += mul(ng2, float4x4(0.0083691375, 0.019432282, -0.0020216214, 0.010078488, -0.010552234, -0.018613037, -0.01894401, -0.019692581, -0.018510802, -0.05016945, -0.019306058, -0.026060628, -0.022509305, -0.026064832, -0.022758938, -0.0126465075));
	target1 += mul(nh2, float4x4(0.014892795, 0.05182727, 0.029640755, 0.042491425, 0.002331018, 0.049497634, 0.023474293, 0.03618418, 0.042020023, 0.022141129, -0.01769678, -0.05579617, 0.01911445, 0.12306055, -0.01590528, 0.034323514));
	target1 += mul(ni2, float4x4(0.006270243, 0.007303163, 0.00036148846, 0.03448912, -0.012123668, -0.009662251, -0.024768578, 0.011880113, 0.022290664, 0.020946119, 0.080712624, 0.07576267, -0.019421795, -0.0020419061, 0.018644858, 0.06175357));
	target1 += float4(-0.0010649486, -0.00016483334, -0.0012494534, -0.00068970193);

	float4 target2 = mul(a1, float4x4(0.0035857174, 0.0015660138, 0.020620639, 0.005492695, 0.001423522, -0.0054458505, 0.012705879, -0.0034037326, 0.0149765, 0.013593896, -0.010144564, -0.0033618324, -0.025044372, -0.015873099, 0.004778018, 0.0032093422));
	target2 += mul(b1, float4x4(-0.048154887, 0.0020999224, -0.025491469, 0.006954485, 0.033840816, -0.004235974, -0.0017972046, -0.016532846, 0.021911, -0.019167153, 0.028508998, -0.009081471, -0.007042987, 0.01162185, -0.04339448, 0.0007982697));
	target2 += mul(c1, float4x4(0.00692694, 0.0055899867, -0.023532946, -0.005534464, 0.0065109115, -0.0013627014, 0.030455293, -0.0042733895, 0.003067062, -0.0019513131, 0.006832627, -0.009393006, -0.000787669, 0.0077676126, 0.017214414, 0.0005746928));
	target2 += mul(d1, float4x4(0.04131343, 0.0061334246, 0.04663622, 0.034750223, 0.08184925, 0.054405987, 0.024695259, 0.049768113, 0.018208977, -0.008268458, -0.0025095541, -0.008479898, -0.024029268, -0.010577515, -0.005219355, 0.018835438));
	target2 += mul(e1, float4x4(-0.06885562, -0.093354285, -0.048095718, -0.09125787, 0.010425496, 0.05662435, 0.063850805, 0.052370857, 0.005560605, 0.10203871, 0.011031155, 0.0477262, -0.18752532, -0.11192383, -0.101294495, -0.109485805));
	target2 += mul(f1, float4x4(0.036398027, 0.014735432, 0.016239874, -0.01584611, -0.01778564, -0.004402638, -0.023574408, 0.02128348, -0.0046439893, -0.009485744, 0.0150541775, 0.042738233, 0.002258786, -0.0077040903, -0.10126805, -0.032423664));
	target2 += mul(g1, float4x4(-0.017738108, -0.005236546, 0.017476387, 0.024146654, -0.008857861, 0.030536365, 0.00078645826, -0.0040980624, 0.011337365, 0.02425144, 0.00035175736, -0.006825428, 0.0052930177, -0.0023696972, 0.005261922, -0.005584412));
	target2 += mul(h1, float4x4(9.1015834e-05, -0.025756188, -0.03374904, -0.04208741, -0.0063595036, -0.013884236, -0.01298973, 0.017781528, 0.00026187245, -0.03274113, 0.02486941, 0.028255679, 0.035963513, -0.0581295, 0.035199746, -0.0104088895));
	target2 += mul(i1, float4x4(-0.01260853, 0.0062392578, -0.003071281, 0.01035934, 0.0057190065, -0.0017513676, -0.0026357982, -0.017834812, 0.00678827, 0.008202837, -0.009278475, -0.020972013, 0.010203599, 0.001279875, 0.011321498, -0.041021187));
	target2 += mul(a2, float4x4(-0.016318664, 0.010274144, 0.0058704168, -0.0003722195, 0.08622261, -0.056748323, 0.012796814, -0.015498583, -0.014841128, -0.013023518, -0.017376468, -0.011344732, -0.0011321327, -0.004839438, -0.065831445, 0.0170905));
	target2 += mul(b2, float4x4(0.029771782, -0.029376734, 0.12636128, -0.041424155, -0.037272833, -0.028138392, 0.05147389, -0.071253724, -0.058589596, -0.0038131222, 0.07219576, -0.018886834, -0.03175935, 0.022464748, 0.04460918, 0.007712493));
	target2 += mul(c2, float4x4(0.031791184, 0.03750064, -0.04815924, -0.0037787687, 0.027883058, 0.0017699281, 0.00868531, -0.00061983714, 0.014328159, 0.011008469, -0.1027948, 0.061321545, -0.0232413, -0.0036074712, 0.013003957, 0.005047646));
	target2 += mul(d2, float4x4(0.061875354, 0.013966958, -0.057976823, -0.027396917, -0.07729206, 0.037510768, 0.05148819, 0.013505393, -0.009836167, 0.017979803, 0.014434765, 0.011236566, 0.038901612, 0.14647634, 0.023550952, -0.05943926));
	target2 += mul(e2, float4x4(-0.2863236, -0.16270663, 0.13858683, 0.38562158, -0.18246396, 0.30728486, -0.30314833, 0.23621106, -0.09020211, 0.39363787, -0.14761628, 0.27088004, 0.023005886, -0.10956146, -0.33344752, -0.20269877));
	target2 += mul(f2, float4x4(0.032677263, 0.04735152, 0.013321085, -0.07145408, -0.0017462671, -0.02075628, -0.005706266, 0.07966981, 0.003467664, -0.07971056, 0.053607278, 0.04101255, -0.035280924, -0.01813141, -0.044000223, 0.025542235));
	target2 += mul(g2, float4x4(-0.008210569, 0.019704876, -0.015133452, -0.03008762, 0.0049174884, 0.08061308, 0.0038117717, -0.002029503, 0.042190753, 0.032804105, 0.014783755, -0.002336826, 0.054137956, 0.005509952, 0.008481185, 0.009885271));
	target2 += mul(h2, float4x4(0.013837548, -0.07767153, 0.022879202, -0.056488827, 0.065868095, -0.18863344, 0.03467011, -0.010413688, 0.021961903, -0.081908144, 0.088906504, 0.03597804, -0.00019099779, 0.031379074, 0.058308717, -0.026305178));
	target2 += mul(i2, float4x4(0.0014572622, -0.011619552, -0.026434032, -0.0043500545, 0.047792483, -0.024726411, 0.06545127, -0.11275325, 0.013236977, -0.00022499474, -0.029280944, -0.06866851, 0.006890194, -0.011312297, 0.0012165548, -0.04664351));
	target2 += mul(na1, float4x4(-0.047752246, 0.0029728701, -0.05373465, -0.005998469, -0.22396794, -0.029485608, 0.09026242, 0.02158353, 0.109403, -0.05481592, 0.010041409, -0.04743197, 0.055757776, 0.02585569, 0.02746905, -0.0030442658));
	target2 += mul(nb1, float4x4(0.099528484, -0.015010706, 0.100571565, -0.0034939381, 0.012570407, -0.043869816, 0.086886376, 0.05987024, -0.22607432, 0.16655043, 0.061426442, 0.05697152, 0.064990446, -0.005531908, -0.14511855, 0.0076572066));
	target2 += mul(nc1, float4x4(-0.012711154, -0.003635479, -0.0075224554, -0.0052927425, 0.0028822776, 0.0072370963, -0.056621686, 0.0026998962, -0.03712505, 0.042426053, -0.22980946, 0.14261132, -0.03626141, -0.039392795, 0.085727766, 0.012577211));
	target2 += mul(nd1, float4x4(0.2062136, -0.19833934, 0.07369608, -0.09740325, -0.034611486, -0.32579082, 0.07641666, 0.15784128, -0.039162353, -0.07734512, -0.033959284, 0.084764875, -0.1607926, -0.11135721, 0.10387057, 0.1112244));
	target2 += mul(ne1, float4x4(0.042789772, 0.07384808, 0.13794817, -0.10090421, 0.22263366, 0.2233975, -0.42347354, -0.17161362, 0.277786, -0.30926275, 0.22161394, -0.37332773, 0.3831451, 0.37317204, 0.079320274, -0.17581266));
	target2 += mul(nf1, float4x4(-0.008462805, -0.017743077, 0.006631768, 0.02413771, -0.03139262, -0.016224122, 0.08141759, -0.057388183, -0.031753678, 0.048525933, 0.058724694, -0.0065176883, -0.022200955, -0.042803597, 0.038398843, 0.10740149));
	target2 += mul(ng1, float4x4(-0.02273882, 0.02042988, -0.04352726, -0.04365105, 0.031618606, 0.12397989, -0.0446275, -0.047745094, -0.07272708, 0.087981805, -0.022883464, 0.021518158, -0.033805974, -0.061570108, 0.029978301, 0.047310177));
	target2 += mul(nh1, float4x4(-0.024014544, 0.03312975, 0.013067503, 0.11282178, -0.024482794, 0.008091268, 0.06432778, -0.15766713, -0.048687667, 0.0051038596, -0.10650007, 0.074438654, -0.031920508, 0.07880885, -0.063074075, 0.053531222));
	target2 += mul(ni1, float4x4(-0.007602651, 0.003247358, -0.010531292, -0.014522784, -0.045127477, -0.04641428, 0.034592852, 0.11708857, 0.016080456, -0.010862374, 0.012204836, -0.019624028, -0.005596978, 0.002570303, -0.01718452, 0.007036096));
	target2 += mul(na2, float4x4(-0.03400744, 0.0074553913, 0.0043038665, 7.957602e-05, 0.014352309, -0.003674803, -0.014835324, -0.0055977562, 0.033462152, 0.0068732416, 0.0065420493, -0.0026703794, 0.011938421, 0.001983706, -0.0049862997, 0.00021165576));
	target2 += mul(nb2, float4x4(-0.0033091118, 0.018994803, -0.026634768, 0.020428555, -0.020792423, 0.009742637, 0.032050136, 0.0071278135, -0.052302815, 0.013190179, -0.004127084, 0.009925822, -0.009661492, -0.0073792427, 0.019004244, 0.00037218904));
	target2 += mul(nc2, float4x4(-0.009932352, -0.00995933, -0.02146786, -0.0038179306, -0.016922737, -0.000982855, -0.03880165, 0.0100428425, 0.018592497, -0.011936452, -0.00839573, 0.012914954, -0.0023291495, 0.0014392055, -0.013602821, -0.00075313775));
	target2 += mul(nd2, float4x4(0.005934442, -0.067556374, 0.022733796, -0.00048716593, -0.03944287, -0.0031405275, -0.021329157, -0.023225859, -0.048807576, 0.013591428, -0.0012470218, 0.012172492, -0.01362018, 0.01908229, -0.009989492, -0.013456593));
	target2 += mul(ne2, float4x4(0.0331533, -0.038016763, -0.024796762, -0.097570956, 0.040756926, -0.038999844, 0.004969716, 0.02203622, -0.07773537, -0.095957406, -0.13685504, -0.08225932, 0.21070166, 0.06672545, 0.12035284, 0.0874353));
	target2 += mul(nf2, float4x4(-0.011405352, 0.008514039, 0.04049351, 0.003270972, -0.01190376, -0.00082049094, 0.011982737, -0.03732464, -0.008393462, 0.019479843, -0.0030869786, -0.028394854, 0.008670484, 0.0026366632, 0.07985739, 0.020958235));
	target2 += mul(ng2, float4x4(0.009845008, 0.039659, -0.0024999548, 0.025727686, -0.008037673, -0.020821366, -0.016814344, -0.023053886, -0.0108242845, -0.03981645, -0.0041943905, -0.0074299327, -0.040454514, -0.05054933, -0.0098485295, -0.015317222));
	target2 += mul(nh2, float4x4(0.0010468375, 0.035403077, 0.008057769, 0.041717537, 0.01396972, 0.04731576, 0.025122687, 0.038469587, 0.0198318, -0.012237324, -0.030784154, -0.08214199, -0.0328875, 0.080247246, -0.06950492, -0.015201896));
	target2 += mul(ni2, float4x4(0.011373379, 0.00011919624, 0.0068450207, 0.0220856, 0.0008673803, -0.011897817, -0.0020893042, 0.0056429924, 0.021536274, 0.017342292, 0.06635433, 0.056875907, -0.03351322, -0.008332283, -0.015365816, 0.037228417));
	target2 += float4(-0.0009943815, -0.0009860148, -0.0010532598, -0.0012024855);

	float4 target3 = mul(a1, float4x4(-0.009554673, -0.0051532127, 0.016102737, 0.0012649148, 0.0077584987, -0.0057105157, 0.014400071, -0.003925339, 0.017135408, 0.013764417, -0.009086141, -0.0025339578, -0.03264511, -0.016280945, 0.00085838477, 0.0019880894));
	target3 += mul(b1, float4x4(-0.076804005, -0.0037365195, -0.050517682, 0.0020104242, 0.043787587, -0.0023654283, 0.008199321, -0.016990408, 0.042490408, -0.010710564, 0.040960148, -0.0044487948, -0.019470902, 0.010283459, -0.05382899, 0.0012054538));
	target3 += mul(c1, float4x4(0.005319574, 0.008546302, -0.037447132, -0.006825204, 0.012196145, 0.0007432002, 0.03959715, -0.00010419698, 0.010211025, -0.00066449976, 0.023840206, -0.0033524157, -0.003079352, 0.010155542, 0.008363695, 0.00091413554));
	target3 += mul(d1, float4x4(0.020189503, -0.01584555, 0.048491407, 0.03190471, 0.09146135, 0.066083826, 0.026668968, 0.0508916, 0.025873413, 9.346497e-05, -0.0008600848, -0.0056697717, -0.02856373, -0.023208767, -0.0029442043, 0.015996031));
	target3 += mul(e1, float4x4(-0.1150775, -0.14330095, -0.09502353, -0.13884564, 0.03178533, 0.07732762, 0.08540057, 0.07563651, 0.024709824, 0.122676425, 0.025598785, 0.062055748, -0.19433355, -0.13121647, -0.1121546, -0.12868516));
	target3 += mul(f1, float4x4(0.029061472, 0.0073350146, -0.009606754, -0.043715715, -0.018559197, -0.0032290102, -0.01473082, 0.030312086, 0.0055792383, 0.0023610878, 0.03392709, 0.0651179, -0.004296543, -0.016898727, -0.1092547, -0.05084782));
	target3 += mul(g1, float4x4(-0.016137538, -0.012792734, 0.017391363, 0.026128957, -0.010817838, 0.032218445, -0.0013944196, -0.004525187, 0.009840227, 0.027271513, -0.0024920607, -0.007501888, 0.009476934, 0.00032995836, 0.0041257427, -0.005625152));
	target3 += mul(h1, float4x4(0.0126353195, -0.03399586, -0.016907396, -0.04899062, -0.007695538, -0.00612431, -0.017332746, 0.021991903, -0.00045945324, -0.02436651, 0.022777557, 0.032261726, 0.050632916, -0.04296159, 0.049930036, 0.0025758578));
	target3 += mul(i1, float4x4(-0.017934766, 0.0015081838, -0.007909493, -0.00010294505, 0.014974485, 0.0058351695, 0.0057603214, -0.0069939885, 0.00735945, 0.011284126, -0.008253157, -0.013189189, 0.011712553, 0.004176325, 0.019821197, -0.031529877));
	target3 += mul(a2, float4x4(-0.01832641, 0.011409, 0.0062228036, 0.0012198171, 0.087020114, -0.048004452, 0.019594414, -0.008977235, -0.021853259, -0.025875412, -0.015291018, -0.017832348, -0.0019894738, -0.003139117, -0.05734562, 0.019568626));
	target3 += mul(b2, float4x4(0.019767432, -0.032121878, 0.10731837, -0.04160946, -0.038930662, -0.023509357, 0.04353874, -0.06393884, -0.07079979, -0.011297704, 0.055680044, -0.031674873, -0.02432217, 0.022975061, 0.044791207, 0.008831304));
	target3 += mul(c2, float4x4(0.026333796, 0.027133184, -0.05641698, -0.015003387, 0.036531202, 0.001104644, 0.017913498, 0.00035784056, 0.010045828, 0.009374881, -0.112320945, 0.055523388, -0.017043058, -0.0012098805, 0.01718199, 0.0069912164));
	target3 += mul(d2, float4x4(0.05305516, 0.0041708737, -0.05804445, -0.028278397, -0.07342492, 0.038176447, 0.045486677, 0.014904428, -0.0017618206, 0.02097501, 0.019868117, 0.019373845, 0.052739795, 0.14912929, 0.037314087, -0.041035987));
	target3 += mul(e2, float4x4(-0.2646953, -0.15668106, 0.12151133, 0.34568876, -0.16625103, 0.28626326, -0.2804781, 0.22248842, -0.09251715, 0.35097396, -0.13397448, 0.244039, 0.02513416, -0.09538499, -0.29833388, -0.17769372));
	target3 += mul(f2, float4x4(0.04070677, 0.052483205, 0.024028141, -0.06267387, 0.010479897, -0.006137579, 0.005726815, 0.084615536, 0.0029081039, -0.08204673, 0.040788963, 0.018049795, -0.031891953, -0.019043325, -0.041718785, 0.021308724));
	target3 += mul(g2, float4x4(-0.001575907, 0.022952983, -0.012997063, -0.02844319, 0.0009266049, 0.06976445, -0.0020330409, -0.008372886, 0.028383447, 0.028487694, 0.0039281887, -0.0070809238, 0.041292112, 0.0025322342, -0.002103223, 0.0054210713));
	target3 += mul(h2, float4x4(0.024716897, -0.05894056, 0.033999596, -0.044356048, 0.063092224, -0.18046258, 0.033589583, -0.014151911, 0.015899418, -0.07901728, 0.073378325, 0.03326658, 0.0032687644, 0.035330176, 0.05457883, -0.017820554));
	target3 += mul(i2, float4x4(0.001996882, -0.004963775, -0.020151457, 0.0057843816, 0.040995013, -0.019961083, 0.05682041, -0.10547617, 0.02503235, 0.0036294905, -0.013947642, -0.060715917, 0.01876786, 0.0032777768, 0.01927007, -0.025088508));
	target3 += mul(na1, float4x4(-0.029518202, 0.0065702717, -0.04704605, -0.0048227045, -0.22110744, -0.029372428, 0.08053116, 0.018663822, 0.105733156, -0.04344413, 0.012286602, -0.038484503, 0.060564645, 0.02851022, 0.032352086, -0.001444222));
	target3 += mul(nb1, float4x4(0.11888213, -0.007396298, 0.12236374, 0.0014036719, 0.006478195, -0.039041974, 0.07476124, 0.05907462, -0.2318871, 0.15429306, 0.048975263, 0.058071293, 0.07446568, -0.0038836622, -0.12506978, 0.006849885));
	target3 += mul(nc1, float4x4(-0.007815376, -0.0046987617, 0.0045718704, -0.00088239316, -0.006927552, -0.0014817615, -0.060644537, -0.001377785, -0.052053373, 0.030922025, -0.24218674, 0.121679045, -0.025974795, -0.033550926, 0.09225413, 0.015534639));
	target3 += mul(nd1, float4x4(0.21073256, -0.14790624, 0.06557328, -0.07896083, -0.048564, -0.32713607, 0.070434004, 0.1443138, -0.046067085, -0.08069691, -0.030934528, 0.0787659, -0.1532774, -0.10070167, 0.088510044, 0.10279543));
	target3 += mul(ne1, float4x4(0.08564646, 0.11520261, 0.17513204, -0.039576255, 0.17702763, 0.18044637, -0.434279, -0.1977341, 0.22995597, -0.3259109, 0.17992231, -0.37692067, 0.376483, 0.3755724, 0.09422753, -0.13918276));
	target3 += mul(nf1, float4x4(-0.0013641209, -0.0061978037, 0.030735753, 0.04674489, -0.01888518, -0.0114997085, 0.072051995, -0.059909277, -0.029514287, 0.037226655, 0.03950886, -0.030156244, -0.017026234, -0.029231733, 0.049310546, 0.12158501));
	target3 += mul(ng1, float4x4(-0.01796674, 0.021739075, -0.035811454, -0.04138176, 0.032448296, 0.11651916, -0.03788447, -0.03756297, -0.06638623, 0.07866896, -0.020593246, 0.021115394, -0.03838543, -0.065531924, 0.024438148, 0.038348224));
	target3 += mul(nh1, float4x4(-0.032326736, 0.041176587, -0.0014080614, 0.110470615, -0.030802252, -0.012258527, 0.059453994, -0.16016562, -0.03715787, -0.0005157213, -0.08924785, 0.06535089, -0.043620374, 0.06014967, -0.07529656, 0.03737166));
	target3 += mul(ni1, float4x4(-0.005197623, 0.0025410433, -0.0057371682, -0.005702406, -0.047433604, -0.045016363, 0.022881668, 0.097284265, 0.0050103525, -0.017835459, 0.002082661, -0.031085419, -0.0127113415, -0.010034892, -0.029547364, -0.008159356));
	target3 += mul(na2, float4x4(-0.030834803, 0.0057913456, 0.0045642396, -0.0023247486, 0.013091116, -0.0019978182, -0.016291631, -0.0063495245, 0.03679821, 0.0111187175, 0.006293907, -0.00159841, 0.008976987, 0.0016699543, -0.002045872, 0.0021225133));
	target3 += mul(nb2, float4x4(-0.0015449662, 0.018941572, -0.0231217, 0.019913983, -0.02159856, 0.011715352, 0.03145841, 0.011557888, -0.04381463, 0.016269097, 0.003801907, 0.017497942, -0.01846834, -0.011969666, 0.0077923136, -0.00425442));
	target3 += mul(nc2, float4x4(-0.006369402, -0.0054975, -0.016373185, 0.0015303852, -0.018976817, -0.0020408076, -0.04186448, 0.00862744, 0.013957444, -0.016489068, -0.00876064, 0.0072258003, -0.0065470496, -0.002731135, -0.01953136, -0.0069286725));
	target3 += mul(nd2, float4x4(0.01150196, -0.057684112, 0.025627816, 0.0045186444, -0.043896675, -0.0073078806, -0.024036214, -0.025075085, -0.041793358, 0.014898103, -0.0012467141, 0.009073226, -0.025522837, 0.0077871215, -0.016588857, -0.016728807));
	target3 += mul(ne2, float4x4(0.03427146, -0.03154995, -0.014622754, -0.083448716, 0.037077904, -0.042196143, 0.0002654827, 0.016835919, -0.05883882, -0.07723022, -0.12151369, -0.06892644, 0.18169776, 0.04656827, 0.09332061, 0.06265968));
	target3 += mul(nf2, float4x4(-0.020037629, 0.0017381558, 0.028106472, -0.0037192027, -0.015639286, -0.0048114993, 0.006957652, -0.04234011, -0.0035926187, 0.02475383, 0.009671758, -0.013367782, 0.009844827, 0.004127156, 0.072107606, 0.018165117));
	target3 += mul(ng2, float4x4(0.0024197982, 0.034167156, -0.00615297, 0.022378683, -0.005956443, -0.0206202, -0.014193571, -0.0220195, -0.004417834, -0.030854845, 0.0032532495, -0.0012370368, -0.031531993, -0.049196836, -0.0011905541, -0.011573349));
	target3 += mul(nh2, float4x4(-0.012435409, 0.023669207, -0.007541224, 0.030453008, 0.009036608, 0.04195238, 0.019962423, 0.033735633, 0.017467575, -0.008318013, -0.0268461, -0.07455821, -0.035670403, 0.06528855, -0.064557284, -0.022101114));
	target3 += mul(ni2, float4x4(0.011575822, -0.0029453707, 0.0029919853, 0.014933897, -0.00034664775, -0.015719024, -0.002594161, 0.0012937526, 0.006969654, 0.008678383, 0.04876611, 0.049061276, -0.037455864, -0.013019295, -0.02504076, 0.024031559));
	target3 += float4(-0.00052569207, -0.0009582512, -0.00071018364, -0.0011515211);

	float2 outputPt = GetOutputPt();

	pos -= 0.5f * outputPt;
	OUTPUT[gxy] = float4(float3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	++gxy.x;
	pos.x += outputPt.x;
	OUTPUT[gxy] = float4(float3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	++gxy.y;
	pos.y += outputPt.y;
	OUTPUT[gxy] = float4(float3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	--gxy.x;
	pos.x -= outputPt.x;
	OUTPUT[gxy] = float4(float3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}
